qPCR data QC

Author

Laura Symul & Laura Vermeren

Published

May 19, 2025

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)

tmp <- fs::dir_map("R scripts/", source)

theme_set(theme_light())

Loading the data

Code
data_source <- "real"
qpcr_dir <- str_c(get_data_dir(data_source), "00 Raw/03 qPCR/")
file <- "VIBRANT_qPCR_data_merged_250416.csv"
cat("We load the file", file, "\n\n")
We load the file VIBRANT_qPCR_data_merged_250416.csv 
Code
qpcr_LBP <- read_csv(str_c(qpcr_dir, file))
file <- "VIBRANT_16SqPCR_20250514.csv"
cat("And the file ", file, "\n\n")
And the file  VIBRANT_16SqPCR_20250514.csv 
Code
qpcr_16S <- read_csv(str_c(qpcr_dir, file))

rm(qpcr_dir, file)

The data is provided in long format:

Code
qpcr_LBP |> glimpse()
Rows: 51,480
Columns: 20
$ Data_row                 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ Well                     <chr> "A02", "A03", "A04", "A05", "A06", "A07", "A0…
$ Fluor                    <chr> "Cy5", "Cy5", "Cy5", "Cy5", "Cy5", "Cy5", "Cy…
$ Content                  <chr> "Unkn-01", "Unkn-01", "Unkn-01", "Unkn-02", "…
$ Replicate                <dbl> 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, …
$ Sample                   <chr> "A1", "A1", "A1", "A2", "A2", "A2", "A3", "A3…
$ Cq                       <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, …
$ `Starting Quantity (SQ)` <dbl> 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+…
$ `Cq Mean`                <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.000…
$ plate_id                 <chr> "B064152", "B064152", "B064152", "B064152", "…
$ Plate_number             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ gDNA_Plate_ID            <dbl> 2240890, 2240890, 2240890, 2240890, 2240890, …
$ VMRC_Group               <chr> "VMRC_3", "VMRC_3", "VMRC_3", "VMRC_3", "VMRC…
$ Group_Fluor              <chr> "VMRC_3_Cy5", "VMRC_3_Cy5", "VMRC_3_Cy5", "VM…
$ Target                   <chr> "185329", "185329", "185329", "185329", "1853…
$ gDNA_Plate_Well          <chr> "2240890_A1", "2240890_A1", "2240890_A1", "22…
$ VIBR_Sample_ID           <chr> "2152456", "2152456", "2152456", "2157507", "…
$ Dilution_Factor          <dbl> 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3…
$ Quant_Adjusted           <dbl> 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+…
$ Copies_per_swab          <dbl> 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, …

Merging 16S and LBP qPCR data

Since the two files have exactly the same columns, we merge them into a single file after some harmonization.

First, we notice that there is only one clinical sample that has data for the LBP qPCR, but not for the 16S qPCR.

Code
full_join(
  qpcr_16S |> select(VIBR_Sample_ID, gDNA_Plate_ID) |> mutate(has_16S = TRUE) |> distinct(),
  qpcr_LBP |> select(VIBR_Sample_ID, gDNA_Plate_ID) |> mutate(has_LBP = TRUE) |> distinct()
) |>  filter(is.na(has_16S) | is.na(has_LBP)) |> 
  filter(!is.na(VIBR_Sample_ID), !(VIBR_Sample_ID %in% c("#N/A", "empty"))) 
# A tibble: 1 × 4
  VIBR_Sample_ID gDNA_Plate_ID has_16S has_LBP
  <chr>                  <dbl> <lgl>   <lgl>  
1 2171048              2240890 NA      TRUE   

We also note that there is one sample on the same plate that has 6 replicates instead of 3 in the 16S qPCR.

Code
qpcr_16S |> filter(!is.na(VIBR_Sample_ID)) |> dplyr::count(VIBR_Sample_ID, gDNA_Plate_ID ) |> filter(n > 3)
# A tibble: 1 × 3
  VIBR_Sample_ID gDNA_Plate_ID     n
  <chr>                  <dbl> <int>
1 2167612              2240890     6
Warning

@Michael: do you think this could be a mislabeling?

Code
# We rename the standards in the 16S qPCR so they are numbered
std_16s <-
  qpcr_16S |> 
  filter(str_detect(Content, "Std")) |> 
  select(Content, `Starting Quantity (SQ)`) |> 
  distinct() |> 
  arrange(`Starting Quantity (SQ)` |> desc()) |>
  mutate(std_nb = row_number())
  
qpcr_16S <- 
  qpcr_16S |> 
  left_join(std_16s, by = join_by(Content, `Starting Quantity (SQ)`)) |> 
  mutate(
    Content = Content |> str_c("-", std_nb |> str_pad(width = 2, pad = "0")),
    Target = "16S"
        ) |> 
  select(-std_nb)

# We transform the VIBR_Sample_ID column to be consistent with the 16S qPCR
qpcr_LBP <- 
  qpcr_LBP |> 
  mutate(
    VIBR_Sample_ID = ifelse(VIBR_Sample_ID == "#N/A", NA_character_, VIBR_Sample_ID)
  )

qpcr <- 
  bind_rows(
    qpcr_LBP |> mutate(target_type = "LBP"), 
    qpcr_16S |> mutate(target_type = "16S rRNA")
    ) |> 
  relocate(target_type, .after = Target)
Code
dictionary <- 
  tibble(original_name = colnames(qpcr)) |> 
  mutate(
    description = 
      case_when(
        (original_name == "Data_row") ~ 'Table row number',
        (original_name == "Well") ~ str_c("qPCR plate well ID. From ", min(qpcr$Well), " to ", max(qpcr$Well)),
        (original_name == "Fluor") ~ str_c('fluorescent dye used (one of ', qpcr$Fluor |> unique() |> str_c(collapse = ", "), ")"),
        (original_name == "Content") ~ 'Indicates whether well contains a sample, a standard, or a negative control.',
        (original_name == "Replicate") ~ 
          str_c(
            "The replicate number from ", min(qpcr$Replicate, na.rm = TRUE), " to ", max(qpcr$Replicate, na.rm = TRUE), 
            " (with ", sum(is.na(qpcr$Replicate)), " missing values). Values are repeated 495, or 990 times."
          ),
        (original_name == "Sample") ~ 'A qPCR plate-specific sample ID. This column wont be used or kept for downstream analyses.',
        (original_name == "Cq") ~ 'Quantification cycle (Cq) value.',
        (original_name == "Starting Quantity (SQ)") ~ 'Starting quantity (SQ) value; computed from the Cq values using the inverse calibration function (calibration curve fitted using the expected SQ values for the standards).',
        (original_name == "Cq Mean") ~ 'Mean across replicates',
        (original_name == "plate_id") ~ str_c('qPCR plate ID; there are ', qpcr$plate_id |> unique() |> length()," unique plate IDs in total."),
        (original_name == "Plate_number") ~ str_c("The extraction plate number; there are ", qpcr$Plate_number |> unique() |> length()," unique plate numbers in total (5 identical `plate_id` per `Plate_number`)."),
        (original_name == "gDNA_Plate_ID") ~ 'The extraction plate ID',
        (original_name == "VMRC_Group") ~ str_c('The group of the LBP strains. There are ', qpcr$VMRC_Group |> unique() |> na.omit() |> length()," unique VMRC groups in total (", qpcr$VMRC_Group |> unique() |> na.omit() |> sort() |> str_c(collapse = ", "), ")"),
        (original_name == "Group_Fluor") ~ 'Concatenation of the `VMRC_Group` and `Fluor` columns',
        (original_name == "Target") ~ str_c('The target "gene" (here taxa/strain). There are ', qpcr$Target |> unique() |> length(), " unique targets in total (", qpcr$Target |> unique() |> str_c(collapse = ", "),")"),
        (original_name == "target_type") ~ "Whether the target is a LBP strain or the 16S rRNA gene target",
        (original_name == "gDNA_Plate_Well") ~ str_c('The concatenation of the VIBRANT swab barcode and the gDNA plate well ID; there are ', qpcr$gDNA_Plate_Well |> unique() |> length()," unique gDNA plate wells in total in the format `[gDNA_Plate_ID]_[Well]` (e.g., ", qpcr$gDNA_Plate_Well[10],")"),
        (original_name == "VIBR_Sample_ID") ~ str_c("VIBRANT sample ID; there are ", qpcr$VIBR_Sample_ID |> unique() |> length(), " unique VIBRANT sample IDs in total)"),
        (original_name == "Dilution_Factor") ~ str_c('The qPCR plate specific dilution factor. There are ', qpcr$Dilution_Factor |> unique() |> length()," unique dilution factors in total (", qpcr$Dilution_Factor |> unique()  |> sort() |> str_c(collapse = ", "),")"),
        (original_name == "Quant_Adjusted") ~ str_c("Quantification value adjusted by the dilution factor; range from ", min(qpcr$Quant_Adjusted, na.rm = TRUE)," to ", max(qpcr$Quant_Adjusted, na.rm = TRUE)),
        (original_name == "Copies_per_swab") ~ str_c("The number of target copies per swab, computed from the adjusted quantification values. Ranges from ", min(qpcr$Copies_per_swab, na.rm = TRUE)," to ", max(qpcr$Copies_per_swab, na.rm = TRUE)),
        TRUE ~ "????"
      )
  )
Code
# dictionary |> gt()

Tidy names

We tidy the column names for consistency and readability.

Code
qpcr <- qpcr |> janitor::clean_names()
qpcr <- qpcr |> 
  dplyr::rename(
    strain_group_qpcr = vmrc_group,
    starting_quantity = starting_quantity_sq,
    pcr_plate_id = plate_id,
    ext_lib_plate_nb = plate_number,
    ext_lib_plate_id = g_dna_plate_id,
    ext_lib_position = g_dna_plate_well
  )
Code
dictionary <- 
  dictionary |> 
  mutate(name = colnames(qpcr)) |> 
  select(name, everything())

The new column names are:

Code
dictionary |> 
  gt() |> 
  cols_label(
    name = "New column name",
    original_name = "Original column name",
    description = "Description"
  ) 
New column name Original column name Description
data_row Data_row Table row number
well Well qPCR plate well ID. From A01 to P24
fluor Fluor fluorescent dye used (one of Cy5, FAM, HEX, SYBR)
content Content Indicates whether well contains a sample, a standard, or a negative control.
replicate Replicate The replicate number from 1 to 96 (with 3927 missing values). Values are repeated 495, or 990 times.
sample Sample A qPCR plate-specific sample ID. This column wont be used or kept for downstream analyses.
cq Cq Quantification cycle (Cq) value.
starting_quantity Starting Quantity (SQ) Starting quantity (SQ) value; computed from the Cq values using the inverse calibration function (calibration curve fitted using the expected SQ values for the standards).
cq_mean Cq Mean Mean across replicates
pcr_plate_id plate_id qPCR plate ID; there are 66 unique plate IDs in total.
ext_lib_plate_nb Plate_number The extraction plate number; there are 11 unique plate numbers in total (5 identical `plate_id` per `Plate_number`).
ext_lib_plate_id gDNA_Plate_ID The extraction plate ID
strain_group_qpcr VMRC_Group The group of the LBP strains. There are 5 unique VMRC groups in total (VMRC_1, VMRC_2, VMRC_3, VMRC_4, VMRC_5)
group_fluor Group_Fluor Concatenation of the `VMRC_Group` and `Fluor` columns
target Target The target "gene" (here taxa/strain). There are 16 unique targets in total (185329, C0028A1, C0112A1, FF00004, C0006A1, 122010, UC101, FF00018, FF00051, UC119, FF00064, FF00072, C0175A1, C0022A1, C0059E1, 16S)
target_type target_type Whether the target is a LBP strain or the 16S rRNA gene target
ext_lib_position gDNA_Plate_Well The concatenation of the VIBRANT swab barcode and the gDNA plate well ID; there are 1155 unique gDNA plate wells in total in the format `[gDNA_Plate_ID]_[Well]` (e.g., 2240890_A4)
vibr_sample_id VIBR_Sample_ID VIBRANT sample ID; there are 1016 unique VIBRANT sample IDs in total)
dilution_factor Dilution_Factor The qPCR plate specific dilution factor. There are 4 unique dilution factors in total (5, 10, 20, 30)
quant_adjusted Quant_Adjusted Quantification value adjusted by the dilution factor; range from 0 to 3e+08
copies_per_swab Copies_per_swab The number of target copies per swab, computed from the adjusted quantification values. Ranges from 0 to 7.5e+10

Sample types and unique sample IDs

We define a qpcr_sample_type variable to summarize information from vibr_sample_id and sample columns.

Code
qpcr <-  
  qpcr |> 
  mutate(
    qpcr_sample_type = 
      case_when(
        is.na(vibr_sample_id) & str_detect(content, "Std") ~ "Standard",
        is.na(vibr_sample_id) & str_detect(sample, "water") ~ "Water",
        str_detect(vibr_sample_id, "EQ") ~ "Control sample",
        str_detect(vibr_sample_id, "empty") ~ "Empty",
        is.na(vibr_sample_id) ~ "Water or empty",
        TRUE ~ "VIBRANT clinical sample"
      )
  )
Code
# qpcr |> filter(target == "16S") |> arrange(qpcr_sample_type) |> View()
Code
qpcr |> dplyr::count(qpcr_sample_type, name = "n_wells") |> arrange(-n_wells) |> gt()
qpcr_sample_type n_wells
VIBRANT clinical sample 46560
Standard 3729
Control sample 2112
Empty 1890
Water 495
Water or empty 126
Code
tmp <- 
  qpcr |> 
  filter(qpcr_sample_type == "VIBRANT clinical sample", target == target[1]) |>
  group_by(vibr_sample_id) |> 
  summarize(
    n_replicates = n(),
    n_plates = ext_lib_plate_nb |> unique() |> length()
  )

# tmp |> dplyr::count(n_plates) # All samples's replicates are on a single plate

# tmp |> dplyr::count(n_replicates) # All samples have 3 replicates

We note that all samples’s replicates are on a single extraction plate (ext_lib_plate_nb or ext_lib_plate_id); and all samples have the same number of replicates (3).

Code
tmp <- 
  qpcr |> 
  filter(qpcr_sample_type == "VIBRANT clinical sample", target == "16S") |>
  group_by(vibr_sample_id) |> 
  summarize(
    n_replicates = n(),
    n_plates = ext_lib_plate_nb |> unique() |> length()
  )

# tmp |> dplyr::count(n_plates) # All samples's replicates are on a single plate

# tmp |> dplyr::count(n_replicates) # All samples have 3 replicates
Caution

As noted above, in the 16S qPCR, there is one sample that has 6 replicates:

Code
tmp |> 
  filter(n_replicates == 6) |> 
  left_join(qpcr |> filter(target == "16S"), by = join_by(vibr_sample_id)) |> 
  select(vibr_sample_id, ext_lib_plate_nb, ext_lib_position, pcr_plate_id, data_row, well, cq) |> 
  gt()
vibr_sample_id ext_lib_plate_nb ext_lib_position pcr_plate_id data_row well cq
2167612 1 2240890_E4 b062862 163 I08 22.40223
2167612 1 2240890_E4 b062862 178 J07 22.42773
2167612 1 2240890_E4 b062862 179 J08 22.37871
2167612 1 2240890_G5 b062862 242 M10 22.61241
2167612 1 2240890_G5 b062862 258 N09 22.52760
2167612 1 2240890_G5 b062862 259 N10 22.76733
Code
qpcr <- 
  qpcr |> 
  mutate(
    well_col = well |> str_sub(1, 1), 
    well_row = well |> str_sub(2,3) |> as.integer()
    ) |> 
  relocate(well_col, well_row, .after = well)

We also define a unique “qPCR sample ID” according to each sample type.

Code
qpcr <- 
  qpcr |> 
  mutate(
    qpcr_sample_id = 
      case_when(
        (qpcr_sample_type == "VIBRANT clinical sample") ~ vibr_sample_id,
        (qpcr_sample_type == "Control sample") ~ vibr_sample_id,
        (qpcr_sample_type == "Empty") ~ 
          str_c(vibr_sample_id,"_plate_", 
                ext_lib_plate_nb |> str_pad(width = 2, pad = 0),"_", sample),
        (qpcr_sample_type == "Water") ~ 
          str_c("water_plate", 
                ext_lib_plate_nb |> str_pad(width = 2, pad = 0)),
        (qpcr_sample_type == "Water or empty") ~ 
          str_c("water_or_empty_plate", 
                ext_lib_plate_nb |> str_pad(width = 2, pad = 0),"_", sample),
        (qpcr_sample_type == "Standard") ~ 
          str_c("std_", content |> str_remove("Std-"), 
                "_plate_", ext_lib_plate_nb |> str_pad(width = 2, pad = 0)),
        TRUE ~ NA_character_
      )
  ) |> 
  group_by(qpcr_sample_id, target) |>
  arrange(well_col, well_row) |>
  mutate(replicate_nb = row_number()) |> 
  ungroup() |> 
  mutate(qpcr_uid = str_c(qpcr_sample_id, "_r", replicate_nb))
Code
# qpcr |> filter(replicate_nb > 3) |> View()

For the LBP strains, the plate layout is as follows:

Code
qpcr |> 
  filter(target != "16S") |> 
  select(ext_lib_plate_nb, well_col, well_row, qpcr_sample_type, sample) |>
  distinct() |>
  ggplot() +
  aes(x =  well_row |> factor(), y = well_col |> factor() |> fct_rev(), fill = qpcr_sample_type) +
  geom_tile() +
  geom_path(aes(group = sample), col = "black", alpha = 0.5) +
  geom_point() +
  facet_wrap(~ ext_lib_plate_nb) +
  xlab("Well column") + ylab("Well row") +
  labs(caption = "Black lines connect replicates") 

For the 16S plates, the layout is a little different:

Code
qpcr |> 
  filter(target == "16S") |> 
  select(ext_lib_plate_nb, pcr_plate_id, well_col, well_row, qpcr_sample_type, qpcr_sample_id) |>
  distinct() |>
  ggplot() +
  aes(x =  well_row |> factor(), y = well_col |> factor() |> fct_rev(), fill = qpcr_sample_type, col = qpcr_sample_type) +
  geom_tile(alpha = 0.25) +
  geom_path(aes(group = qpcr_sample_id), alpha = 0.5, linewidth = 2) +
  geom_point() +
  facet_wrap(~ ext_lib_plate_nb + pcr_plate_id) +
  xlab("Well column") + ylab("Well row") +
  labs(caption = "Lines connect replicates") 

Manifest data

We merge the qPCR data with the metagenomics manifest data to add the participant ID (pid) and (visit_code).

Code
SE_mg <- 
  readRDS(
    list.files(
      path = get_output_dir(data_source = data_source), 
      pattern = "02_se_mg_.*\\.rds$", full.names = TRUE
    ) |> 
      sort(decreasing = TRUE) |>
      extract(1) 
  )
Code
qpcr <- 
  qpcr |> 
  left_join(
    SE_mg |>
      colData() |> as.data.frame() |> as_tibble() |> 
      select(
        swab_barcode,
        uid, mg_sample_id, 
        mg_pid, pid, visit_code, location,
        sample_type, control_type, sample_category
      ) |> 
      distinct() ,
    by = join_by("qpcr_sample_id" == "swab_barcode")
  )
Code
samples_without_entries_in_mg_manifest <- 
  qpcr |> 
  filter(qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")) |>
  filter(is.na(uid)) |> 
  select(qpcr_sample_id, qpcr_sample_type) |> 
  distinct()  |> 
  arrange(qpcr_sample_type, qpcr_sample_id) 

There are 48 clinical or vibrant positive/negative control samples without a matching entry in the metagenomics manifest.

Code
samples_without_entries_in_mg_manifest |> 
  dplyr::count(qpcr_sample_type) |>
  gt()
qpcr_sample_type n
Control sample 26
VIBRANT clinical sample 22

TODO: export for Michael?

For these samples, the current uid is missing. We change it to the qpcr_sample_id when NA.

Code
qpcr <- 
  qpcr |> 
  mutate(
    uid = case_when(
      is.na(uid) & 
        (qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")) ~ 
        qpcr_sample_id,
      TRUE ~ uid
    )
  )
Code
mg_samples <- 
  SE_mg |>
  colData() |> as.data.frame() |> as_tibble() |> 
  select(
    swab_barcode,
    uid, mg_sample_id, 
    mg_pid, pid, visit_code, 
    sample_type, control_type, sample_category
  ) |> 
  distinct() |> 
  left_join(
    qpcr |> 
      select(qpcr_sample_id, qpcr_sample_type) |> 
      distinct() |> 
      filter(qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")),
    by = join_by("swab_barcode" == "qpcr_sample_id")
  )

# mg_samples |> filter(is.na(qpcr_sample_type)) |> nrow()

There are 0 samples for which we have metagenomics data but no qPCR data.

Code
rm(mg_samples, samples_without_entries_in_mg_manifest, tmp)

Target’s plate layout and fluorescent probes

The plate x Fluorescence x Target layout is as follows:

Code
qpcr |> 
  dplyr::count(fluor, target, strain_group_qpcr, pcr_plate_id, ext_lib_plate_nb) |> 
  arrange(ext_lib_plate_nb, strain_group_qpcr, pcr_plate_id, fluor) |> 
  mutate(
    target = target |> fct_inorder(), 
    plate_nb = str_c("ext. plate: ", ext_lib_plate_nb) |> fct_inorder(),
    pcr_plate_id = pcr_plate_id |> fct_inorder()
    ) |> 
  ggplot() +
  aes(
    x = pcr_plate_id,
    y = target |> factor() |> fct_rev(),
    fill = fluor
  ) +
  geom_tile() +
  facet_grid(strain_group_qpcr ~ plate_nb, scales = "free", space = "free") +
  xlab("qPCR plate ID") +
  ylab("Target") +
  scale_fill_manual(
    name = "Fluor.",
    values = c("FAM" = "dodgerblue1", "HEX" = "green3", "Cy5" = "#F8766D", "SYBR" = "green")
  ) +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  )

Potential batch effects due to extraction should be checked for ext_lib_plate_id and those due to the qPCR itself using the pcr_plate_id.

LBP strain information

We also add the LBP strain information to the qPCR data.

Code
qpcr <- 
  qpcr |> 
  left_join(
    SE_mg |>
      rowData() |> as.data.frame() |> as_tibble() |> 
      filter(!is.na(LBP)) |> 
      select(taxa, taxon_label, LBP, strain_id, strain_origin, biose_id) |> 
      dplyr::rename(target = taxa),
    by = join_by(target)
  )
Code
qpcr <- 
  qpcr |> 
  mutate(taxon_label = taxon_label |> str_replace_na("16S rRNA gene target"))
Code
rm(SE_mg)
Code
qpcr |> 
  arrange(strain_group_qpcr, strain_origin) |>
  mutate(target = target |> fct_inorder()) |> 
  ggplot() +
  aes(x = target, y = strain_group_qpcr, col = strain_group_qpcr) +
  geom_point() +
  facet_grid(. ~ LBP + strain_origin, scales = "free", space = "free")  +
  xlab("Target") + ylab("VMRC group") +
  guides(col = "none") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) 

Relationships between columns

Observations and calibration curves

Observed values are the cq values.

starting_quantity are computed from the cq values using the calibration curves fitted on the standard values.

\[C = f(SQ^*)\]

where \(C\) is the observed cq value for the standard, \(SQ^*\) is the known starting quantity starting_quantity for the standards, and \(f\) is the calibration function.

Note

I assume that, either the standards are not diluted, and that the quant_adjusted and copies_per_swabs are not correct for the standards, or that the standards are diluted and that the calibration curves are fitted on the quant_adjusted:

Code
qpcr |>
  filter(qpcr_sample_type == "Standard", target != "16S") |> 
  select(
    sample, starting_quantity, dilution_factor, quant_adjusted, copies_per_swab
  ) |> 
  distinct() |> 
  arrange(sample, dilution_factor) |> 
  gt()
sample starting_quantity dilution_factor quant_adjusted copies_per_swab
std 10^1 1e+01 5 5e+01 1.25e+04
std 10^1 1e+01 10 1e+02 2.50e+04
std 10^1 1e+01 20 2e+02 5.00e+04
std 10^1 1e+01 30 3e+02 7.50e+04
std 10^2 1e+02 5 5e+02 1.25e+05
std 10^2 1e+02 10 1e+03 2.50e+05
std 10^2 1e+02 20 2e+03 5.00e+05
std 10^2 1e+02 30 3e+03 7.50e+05
std 10^3 1e+03 5 5e+03 1.25e+06
std 10^3 1e+03 10 1e+04 2.50e+06
std 10^3 1e+03 20 2e+04 5.00e+06
std 10^3 1e+03 30 3e+04 7.50e+06
std 10^4 1e+04 5 5e+04 1.25e+07
std 10^4 1e+04 10 1e+05 2.50e+07
std 10^4 1e+04 20 2e+05 5.00e+07
std 10^4 1e+04 30 3e+05 7.50e+07
std 10^5 1e+05 5 5e+05 1.25e+08
std 10^5 1e+05 10 1e+06 2.50e+08
std 10^5 1e+05 20 2e+06 5.00e+08
std 10^5 1e+05 30 3e+06 7.50e+08
std 10^6 1e+06 5 5e+06 1.25e+09
std 10^6 1e+06 10 1e+07 2.50e+09
std 10^6 1e+06 20 2e+07 5.00e+09
std 10^6 1e+06 30 3e+07 7.50e+09
std 10^7 1e+07 5 5e+07 1.25e+10
std 10^7 1e+07 10 1e+08 2.50e+10
std 10^7 1e+07 20 2e+08 5.00e+10
std 10^7 1e+07 30 3e+08 7.50e+10
Code
qpcr |>
  filter(qpcr_sample_type == "Standard") |> 
  arrange(-dilution_factor) |>
  ggplot() +
  aes(
    # x = starting_quantity |> log10(), 
    x = quant_adjusted |> log10(),
    y = cq, 
    col = dilution_factor |> factor()
  ) +
  geom_point(size = 1, alpha = 0.2) +
  scale_color_manual(
    "Dilution factor", 
    values = colorRampPalette(c("black", "dodgerblue1"))(4)
    ) +
  facet_wrap(. ~ strain_group_qpcr + target, ncol = 9) 

For all clinical samples, the starting_quantity is computed from the cq values using the calibration curve fitted on the standard values.

\[\hat{SQ} = f^{-1}(C)\]

where \(\hat{SQ}\) is the estimated starting quantity starting_quantity for the clinical samples, \(C\) is the observed cq value for the clinical sample, and \(f^{-1}\) is the inverse of the calibration function.

If \(C\) is not observed (i.e., assumed to be larger than the total number of cycles), then \(\hat{SQ}\) is estimated as 0.

Code
# qpcr |> 
#   ggplot() +
#   aes(x = starting_quantity |> log10(), y = cq, col = qpcr_sample_type) +
#   geom_point(size = 0.5, alpha = 0.4) +
#   facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb) 


qpcr |> 
  ggplot() +
  aes(
    x = starting_quantity |> log10(), 
    y = cq, 
    col = ext_lib_plate_nb |> factor(),
    size = (qpcr_sample_type == "Standard")
    ) +
  geom_point(alpha = 0.5) +
  facet_wrap(strain_group_qpcr + target ~ ., ncol = 6) +
  scale_size_manual("Standard", values = c(0.2, 1)) +
  scale_color_viridis_d("Extraction plate number", direction = -1, option = "A", end = 0.8) 

It looks like \(f\) is a linear function of \(\log(SQ^*)\): \(C = \beta_0 + \beta_1 \log(SQ^*)\). This seems appropriate for the LBP strains, but for the total 16S, the standard curves are not linear. Probably not too much of an issue and unlikely to create huge batch effects.

Adjusted quantities

Adjusted quantities (quant_adjusted) are then computed from the estimated starting_quantity values using the dilution factors.

\(\hat{Q} = \hat{SQ} \times d\)

where \(\hat{Q}\) is the estimated adjusted quantity (quant_adjusted), \(\hat{SQ}\) is the estimated starting quantity (starting_quantity) for the clinical samples, and \(d\) is the dilution factor.

Code
qpcr |>
  ggplot() +
  aes(
    x = starting_quantity |> log10(), 
    y = quant_adjusted |> log10(), 
    col = dilution_factor |> factor()
    ) +
  geom_point() +
  facet_wrap(. ~ strain_group_qpcr + target, ncol = 9) +
  scale_color_manual(
    "Dilution factor", 
    values = colorRampPalette(c("dodgerblue1", "gray90"))(4)
    )

quant_adjusted = starting_quantity x dilution_factor:

Code
qpcr |>
  ggplot() +
  aes(
    x = (starting_quantity * dilution_factor) |> log10(), 
    y = (quant_adjusted) |> log10(), 
    col = dilution_factor |> factor()
    ) +
  geom_point() +
  facet_wrap(. ~ strain_group_qpcr + target, ncol = 9) +
  scale_color_manual(
    "Dilution factor", 
    values = colorRampPalette(c("dodgerblue1", "gray90"))(4)
    )

Copies per swab

copies_per_swab are the adjusted_quant multiplied by a constant factor estimated to be the number of bacteria in 1 concentration unit.

Here, that number is 250 for all LBP targets.

Code
# strain_group_qpcr + target

qpcr |>
  filter(target != "16S") |> 
  ggplot() +
  aes(
    x = (quant_adjusted * 250) |> log10(), 
    y = copies_per_swab |> log10(), 
    col =  target
    ) +
  geom_abline(intercept = 0, slope = 1, col = "gray") +
  geom_point(size = 0.5, alpha = 0.4) +
  facet_wrap(. ~ ext_lib_plate_nb , ncol = 6) 

For the 16S target, that number is 500.

Code
qpcr |>
  filter(target == "16S") |> 
  mutate(
    r = copies_per_swab / (quant_adjusted )
  ) |> 
  pull(r) |> 
  hist()

Dilution factors

Dilution factors are defined per qPCR plate (and consequently, are the same for the 3 targets in the same VMRC group)

Code
qpcr |> 
  dplyr::count(target, dilution_factor) 

qpcr |> 
  group_by(pcr_plate_id, target) |>
  summarize(n_dilution_factors = dilution_factor |> unique() |> length()) 
Code
#|

qpcr |> 
  select(
    starts_with("well"), 
    pcr_plate_id, 
    strain_group_qpcr, 
    ext_lib_plate_nb, 
    dilution_factor
    ) |> 
  distinct() |> 
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = dilution_factor |> factor()
    ) +
  geom_tile() +
  facet_grid(strain_group_qpcr ~ ext_lib_plate_nb, scales = "free", space = "free") +
  scale_fill_manual(
    "Dilution factor", 
    values = colorRampPalette(c("dodgerblue1", "gray90"))(4)
    ) +
  xlab("Well column") + ylab("Well row") 

Exploratory & QC analyses

CQ and Copies per swabs per well

Code
qpcr |> 
  select(
    starts_with("well"), 
    pcr_plate_id, 
    target,
    strain_group_qpcr, 
    ext_lib_plate_nb, 
    cq
    ) |> 
  distinct() |> 
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = cq
    ) +
  geom_tile() +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free", space = "free") +
  xlab("Well column") + ylab("Well row") 

Code
qpcr |> 
  select(
    starts_with("well"), 
    pcr_plate_id, 
    target,
    strain_group_qpcr, 
    ext_lib_plate_nb, 
    copies_per_swab
    ) |> 
  distinct() |> 
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = copies_per_swab |> asinh()
    ) +
  geom_tile() +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free", space = "free") +
  xlab("Well column") + ylab("Well row") 

Warning

From these visualizations of the raw data, something looks weird with the cq and copies_per_swab values for the first 3 plates of “VMRC group 1”.

Total number of copies per swab per sample type

Code
g <- 
  qpcr |> 
  ggplot() +
  aes(x = target, y = copies_per_swab, col = qpcr_sample_type, fill = qpcr_sample_type) +
  geom_boxplot(alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_grid(. ~ target_type, scales = "free", space = "free")

g 

Code
g + scale_y_log10()

Code
g <- 
  qpcr |> 
  ggplot() +
  aes(x = target, y = copies_per_swab, col = qpcr_sample_type, fill = qpcr_sample_type) +
  geom_boxplot(alpha = 0.5) +
  facet_grid(qpcr_sample_type ~ ., scales = "free") +
  guides(fill = "none", color = "none") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  ) +
  facet_grid(. ~ target_type, scales = "free", space = "free")

g 

Code
g + scale_y_log10()

Code
g <- 
  qpcr |> 
  ggplot() +
  aes(x = ext_lib_plate_nb |> factor(), y = copies_per_swab, col = target, fill = target) +
  geom_boxplot(alpha = 0.5) +
  facet_grid(qpcr_sample_type ~ ., scales = "free") +
  xlab("Extraction plate number") +
  theme(
    strip.text.y = element_text(angle = 0),
    strip.text.x = element_text(angle = 90, hjust = 0)
  )

g 

Code
g + scale_y_log10() 

Empty and water samples

From the plots above, it looks like the empty and water samples have non-zero values on a few LBP plates.

Code
plot_qpcr_empties <- function(qpcr, color_by = "qpcr_sample_type") {
  qpcr |> 
    filter(qpcr_sample_type %in% c("Empty", "Water")) |> 
    group_by(sample, pcr_plate_id, ext_lib_plate_nb, target) |>
    mutate(replicate_nb = row_number()) |> 
    ungroup() |> 
    ggplot() +
    aes(x = sample, y = copies_per_swab, label = replicate_nb, col = !!sym(color_by)) +
    geom_text(size = 3) +
    facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free_x", space = "free_x") +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      strip.text.y = element_text(angle = 0)
    ) 
}
Code
plot_qpcr_empties(qpcr, color_by = "qpcr_sample_type") 

Code
plot_qpcr_empties(qpcr |> mutate(zero_copies = (copies_per_swab == 0)), color_by = "zero_copies") 

Code
prob_target <-
  qpcr |> 
  select(target) |> 
  distinct() |> 
  mutate(prob_target = (target %in% c("C0022A1", "C0059E1", "C0175A1")))
Warning

We see the same issues as above for the first 3 plates of VMRC group 1.

Control samples

Code
qpcr |> 
  filter(qpcr_sample_type %in% c("Control sample")) |> 
  group_by(sample, pcr_plate_id, ext_lib_plate_nb, target) |>
  mutate(replicate_nb = row_number()) |> 
  ungroup() |> 
  mutate(
    control_type = control_type |> str_replace_na("? Unknown (not in MG manifest)"),
    vibr_sample_id = vibr_sample_id |> fct_reorder(control_type),
    plate_nb = str_c("ext. plate\n", ext_lib_plate_nb) |> fct_reorder(ext_lib_plate_nb),
    ) |> 
  ggplot() +
  aes(x = vibr_sample_id, y = copies_per_swab, label = replicate_nb, col = control_type) +
  geom_text(size = 3) +
  scale_y_log10() +
  scale_color_discrete("Control type") +
  facet_grid(strain_group_qpcr + target ~ plate_nb, scales = "free_x", space = "free_x") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    # strip.text.x = element_text(angle = 90),
    strip.text.y = element_text(angle = 0)
  ) 

Warning

I am surprise to see that almost all Mock 1 samples are positive (copies per swabs >>) because, these samples were supposed to be pooled samples from the CAP084 study. I don’t think that participants of that study received the LBP. It could also be that I made wrong assumptions when matching the vibr_sample_id with the metagenomics manifest data.

The “negative controls” look better (= more consistent with expectations), except, again, for the first 3 plates of VMRC group 1.

Standards

Code
qpcr |> 
  filter(qpcr_sample_type %in% "Standard") |>
  ggplot() +
  aes(x = starting_quantity, y = cq, col = replicate_nb |> factor()) +
  geom_point(alpha = 0.5, size = 0.5) +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb) +
  scale_x_log10() +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

For the 16S, there seems to be “reverse saturation” (i.e., the cq is lower than expected with a linear relationship). Maybe the dilution was not that reliable at such low concentrations.

Code
qpcr |> 
  filter(qpcr_sample_type %in% "Standard") |>
  mutate(std_nb = content |> str_remove("Std-") |> parse_number()) |> 
  ggplot() +
  aes(x = starting_quantity, y = cq, col = dilution_factor |> factor()) +
  geom_path(aes(group = interaction(ext_lib_plate_nb, replicate_nb)), alpha = 0.2) +
  geom_text(alpha = 0.2, aes(label = std_nb)) +
  facet_wrap(strain_group_qpcr + target ~ ., ncol = 9) +
  scale_x_log10() +
  scale_color_manual(
    name = "Dilution factor",
    breaks = c(5,10,20,30),
    values = c("black", "deeppink3", "deeppink1", "pink")
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

It looks like standard 4 was not prepared or placed properly for VMRC group 4.

Code
qpcr |> 
  filter(qpcr_sample_type %in% "Standard") |>
  mutate(std_nb = content |> str_remove("Std-") |> parse_number()) |> 
  ggplot() +
  aes(x = starting_quantity, y = cq, col = ext_lib_plate_nb |> factor()) +
  geom_path(aes(group = interaction(ext_lib_plate_nb, replicate_nb)), alpha = 0.2) +
  geom_text(alpha = 0.2, aes(label = std_nb)) +
  facet_wrap(strain_group_qpcr + target ~ ., ncol = 9) +
  scale_x_log10() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

This shows again the issues with the first 3 plates of VMRC group 1.

Clinical samples

Code
qpcr |> 
  filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |> 
  group_by(vibr_sample_id, sample, pcr_plate_id, ext_lib_plate_nb, target) |>
  mutate(
    replicate_nb = row_number(),
    median_cps = median(copies_per_swab, na.rm = TRUE),
    mean_cps = mean(copies_per_swab, na.rm = TRUE)
    ) |> 
  ungroup() |> 
  mutate(vibr_sample_id = vibr_sample_id |> fct_reorder(mean_cps)) |>
  ggplot() +
  aes(x = vibr_sample_id, y = copies_per_swab, label = replicate_nb, col = replicate_nb |> factor()) +
  geom_line(aes(group = vibr_sample_id), col = "black", linewidth = 0.1) +
  geom_text(size = 3) +
  scale_y_log10() +
  scale_color_discrete("replicate") +
  scale_x_discrete("Samples", breaks = NULL) +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free_x", space = "free_x") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  )

We notice again the same issues with the VRMC group 1 plate 1-3.

Warning

We could simply discard these values (exclude these targets x plates) or try to identify a new threshold to differentiate between background noise and actual signal.

New detection thresholds ?

The idea is to use the negative controls cq values to define plate- and target-specific “detection” thresholds, then use these thresholds to set to 0 the copies_per_swab values for which the cq values are above the threshold.

The figures below show the cq values for the VMRC group 1 and 2.

Code
qpcr |>
  filter(strain_group_qpcr %in% c("VMRC_1", "VMRC_2")) |>
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = cq
    ) +
  geom_tile() +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free", space = "free") +
  xlab("Well column") + ylab("Well row") 

We impute the missing cq values to 45 (well above the maximum observed value, and what I assume was the total number of cycles).

Code
qpcr <- qpcr |> mutate(cq_imp = cq |> replace_na(45)) 
Code
qpcr |>
  filter(strain_group_qpcr %in% c("VMRC_1", "VMRC_2")) |>
  ggplot() +
  aes(
    x = well_row |> factor(), 
    y = well_col |> fct_rev(), 
    fill = cq_imp
    ) +
  geom_tile() +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free", space = "free") +
  xlab("Well column") + ylab("Well row") 

Comparing the cq values per sample type

Code
qpcr <- 
  qpcr |>
  mutate(
    simplified_type = 
      case_when(
        str_detect(control_type, "Mock") ~ "Positive control",
        (qpcr_sample_type == "Control sample") & is.na(sample_type) ~ "Unknown control",
        qpcr_sample_type %in% c("Control sample", "Empty", "Water") ~ "Negative control",
        TRUE ~ qpcr_sample_type
      ) |> 
      fct_infreq(),
    qpcr_sample_type = qpcr_sample_type |> fct_infreq()
    ) 

# qpcr |> dplyr::count(simplified_type, sample_type, control_type, qpcr_sample_type)

We define the thresholds as the average over the two smallest cq value for the “empties” (i.e., empty and water samples, as well as the negative controls).

Code
thresholds <- 
  qpcr |> 
  filter(target != "16S") |> 
  group_by(target, strain_group_qpcr, ext_lib_plate_nb) |> 
  summarize(
    min_cq_empties = min(cq_imp[simplified_type == "Negative control"], na.rm = TRUE),
    robust_min_cq_empties = cq_imp[simplified_type == "Negative control"] |> sort() |> extract(1:2) |> mean(),
    max_cq_std = max(cq_imp[simplified_type == "Standard"], na.rm = TRUE),
    .groups = "drop"
  ) |> 
  mutate(
    threshold_cq = robust_min_cq_empties
  )
Code
qpcr |> 
  filter(
    strain_group_qpcr %in% c("VMRC_1", "VMRC_2"), 
    ext_lib_plate_nb <= 6,
    simplified_type %in% c("VIBRANT clinical sample", "Standard", "Negative control") 
    ) |> 
  arrange(simplified_type) |> 
  ggplot() +
  aes(x = simplified_type |> str_wrap(width = 15), y = cq_imp, col = qpcr_sample_type) + #  
  geom_violin(col = "gray") +
  geom_jitter(height = 0, width = 0.25, alpha = 0.5, size = 0.75) +
  geom_hline(
    data = thresholds |> filter(strain_group_qpcr %in% c("VMRC_1", "VMRC_2"), ext_lib_plate_nb <= 6), 
    aes(yintercept = threshold_cq), col = "steelblue1"
    ) +
  facet_grid(target ~ ext_lib_plate_nb) +
  scale_color_manual(values = c("steelblue1", "black", "dodgerblue3", "red","pink") |> rev()) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) +
  xlab("Sample type") 

Code
qpcr <- 
  qpcr |> 
  select(-any_of(c("min_cq_empties","robust_min_cq_empties", "max_cq_std", "threshold_cq"))) |> 
  left_join(thresholds, by = join_by(ext_lib_plate_nb, strain_group_qpcr, target)) 

For all plates and targets

Code
qpcr |>
  filter(qpcr_sample_type == "VIBRANT clinical sample") |> 
  ggplot() +
  aes(x = cq_imp, y = target) +
  geom_violin(fill = "gray90", col = "transparent") +
  geom_jitter(width = 0, size = 0.1, col = "pink") +
  geom_vline(aes(xintercept = min_cq_empties), col = "dodgerblue") +
  geom_vline(aes(xintercept = robust_min_cq_empties), col = "green3") +
  geom_vline(aes(xintercept = max_cq_std), col = "red") +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free") +
  theme(strip.text.y = element_text(angle = 0)) +
  labs(
    x = "cq value (imputed to 45 when not avalaible)",
    caption = 
      str_c(
        "Pink dots are the cq values for the VIBRANT clinical samples\n",
        "The blue lines are the minimum cq value for the negative controls\n",
        "The green lines are the average of the two smallest cq values for the negative controls (= the detection threshold)\n",
        "The red lines are the maximum cq value for the standards"
      )
  )

Adjusted copies per swabs

We create a new column copies_per_swab_adj that is set to 0 if the cq_imp is above the threshold.

Code
qpcr <- 
  qpcr |> 
  mutate(
    copies_per_swab_adj = 
      case_when(
        (target != "16S") & (cq_imp >= threshold_cq) ~ 0,
        TRUE ~ copies_per_swab
      )
  )
Code
qpcr |> 
  filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |> 
  group_by(qpcr_sample_id, sample, pcr_plate_id, ext_lib_plate_nb, target) |>
  mutate(
    replicate_nb = row_number(),
    median_cps = median(copies_per_swab_adj, na.rm = TRUE),
    mean_cps = mean(copies_per_swab_adj, na.rm = TRUE)
    ) |> 
  ungroup() |> 
  mutate(qpcr_sample_id = qpcr_sample_id |> fct_reorder(mean_cps)) |>
  ggplot() +
  aes(x = qpcr_sample_id, y = copies_per_swab_adj, label = replicate_nb, col = replicate_nb |> factor()) +
  geom_line(aes(group = qpcr_sample_id), col = "black", linewidth = 0.1) +
  geom_text(size = 3) +
  scale_y_log10() +
  scale_color_discrete("replicate") +
  scale_x_discrete("Samples", breaks = NULL) +
  facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free_x", space = "free_x") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  )

This does not fully solve the problem, but I hope that at the “aggregation” step, taking the median value would remove a lot of false positive.

If not, a more stringent threshold could potentially be used, or these samples would have to be excluded from the analysis.

Aggregation across replicates

Code
qpcr <- 
  qpcr |> 
  group_by(qpcr_sample_id, target) |> 
  mutate(
    copies_per_swab_raw_median = median(copies_per_swab, na.rm = TRUE),
    copies_per_swab_adj_median = median(copies_per_swab_adj, na.rm = TRUE),
    copies_per_swab_adj_range = max(copies_per_swab_adj, na.rm = TRUE) - min(copies_per_swab_adj, na.rm = TRUE),
  ) |> 
  ungroup()
Code
qpcr |> 
  filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |>
  group_by(qpcr_sample_id) |> mutate(n_target_detected = sum(copies_per_swab_raw_median > 0)) |> ungroup() |>
  mutate(qpcr_sample_id = qpcr_sample_id |> fct_reorder(n_target_detected)) |>
  ggplot() +
  aes(x = qpcr_sample_id, y = target, fill = copies_per_swab_raw_median |> log10()) +
  geom_tile() +
  facet_grid(strain_group_qpcr + LBP  ~ ext_lib_plate_nb, scales = "free", space = "free") +
  scale_fill_gradient(low = "dodgerblue1", high = "black", na.value = "white") +
  theme(
    axis.text.x = element_blank(),
    legend.position = "bottom"
  ) +
  ggtitle("Median of the original `copies_per_swabs`")

Code
qpcr |> 
  filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |>
  group_by(qpcr_sample_id) |> mutate(n_target_detected = sum(copies_per_swab_adj_median > 0)) |> ungroup() |>
  mutate(qpcr_sample_id = qpcr_sample_id |> fct_reorder(n_target_detected)) |>
  ggplot() +
  aes(x = qpcr_sample_id, y = target, fill = copies_per_swab_adj_median |> log10()) +
  geom_tile() +
  facet_grid(strain_group_qpcr + LBP  ~ ext_lib_plate_nb, scales = "free", space = "free") +
  scale_fill_gradient(low = "dodgerblue1", high = "black", na.value = "white") +
  theme(
    axis.text.x = element_blank(),
    legend.position = "bottom"
  ) +
  ggtitle("Median of the adjusted `copies_per_swabs`")

The threshold correction likely helped in reducing the number of false positive.

Longitudinal profiles

When looking at the longitudinal patterns for randomized participants, we do not notice an over-representation of a specific plate or group in the “isolated” positives:

Code
qpcr |> 
  filter(
    sample_type %in% c("Clinical sample"), 
    sample_category %in% c("Expected sample"),
    visit_code %in% c(seq(1000, 1900, by = 100), 2120)
    ) |>
  group_by(pid) |> mutate(n_target_detected = sum(copies_per_swab_adj_median > 0)) |> ungroup() |>
  mutate(pid = pid |> fct_reorder(n_target_detected)) |>
  ggplot() +
  aes(x = pid, y = visit_code, fill = ext_lib_plate_nb |> factor(), alpha = copies_per_swab_adj_median |> log10()) +
  geom_tile() +
  facet_grid(strain_group_qpcr + target ~ ., scales = "free") +
  scale_fill_discrete("Extraction plate number") +
  scale_alpha_continuous(limits = c(0, 10)) +
  theme(
    axis.text.x = element_blank(),
    strip.text.y = element_text(angle = 0),
    legend.position = "bottom"
  )

We should thus expect some false positive in the LBP qPCR data :)

Code
qpcr |> 
  filter(
    sample_type %in% c("Clinical sample"), 
    sample_category %in% c("Expected sample"),
    visit_code %in% c(seq(1000, 1900, by = 100), 2120),
    target == "16S"
    ) |>  
  arrange(pid, visit_code) |> 
  select(pid, visit_code, copies_per_swab_adj_median) |>
  distinct() |> 
  ggplot() +
  aes(x = visit_code, y = copies_per_swab_adj_median, col = pid) +
  geom_boxplot(alpha = 0.2, fill = "dodgerblue1", color = "dodgerblue1") +
  geom_line(aes(group = pid), alpha = 0.2) +
  # geom_point(size = 0.5) +
  ggbeeswarm::geom_quasirandom(width = 0.2) +
  scale_y_log10() +
  guides(col = "none") +
  ggtitle("16S copies per swab") 

Creating SummarizedExperiment objects

We create two Summarized Experiment objects:

  • one with all values as provided in the original file, where each “sample” is a plate well and each feature is a target (LBP taxa)

  • one with summarized values (aggregated across replicates) where each “sample” is a VIBRANT sample ID (including “biological” control samples) and each feature is a target (LBP taxa)

Raw SummarizedExperiment object

We create a SummarizedExperiment object with the following assays:

  • dilution_factor
  • cq
  • starting_quantity
  • cq_mean
  • quant_adjusted
  • copies_per_swab_raw (current copies_per_swab column)
  • copies_per_swab (current copies_per_swab_adj column)

The “samples” are defined by the qpcr_uid column (one per well and plate) and the “features” are defined by the target column (i.e., the LBP taxa).

The colData contains the following columns:

  • qpcr_uid (the qPCR specific unique identifier)
  • uid (the VIBRANT cross-assay unique sample identifier)
  • pid (the participant ID)
  • visit_code (the visit code)
  • qpcr_sample_id
  • vibr_sample_id
  • qpcr_sample_type
  • sample_type
  • control_type
  • ext_lib_plate_nb
  • ext_lib_plate_id

The rowData contains the following columns:

  • fluor
  • strain_group_qpcr
  • pcr_plate_id (the concatenation of pcr_plate_id for each ext_lib_plate_nb)
  • taxon_label
  • LBP
  • strain_id
  • strain_origin
  • biose_id
Code
qpcr |> dplyr::count(qpcr_uid) |> dplyr::count(n)

qpcr |> dplyr::count(qpcr_uid) |> nrow()


qpcr |> dplyr::count(qpcr_uid) |> filter(n != 16) |> View()
Code
make_raw_qpcr_SE <- function(qpcr){
  
  qpcr <- 
    qpcr |> 
    arrange(ext_lib_plate_nb, well_col, well_row) |>
    mutate(qpcr_uid = qpcr_uid |> fct_inorder()) |> 
    arrange(strain_group_qpcr, fluor) |> 
    mutate(target = target |> fct_inorder()) |> 
    arrange(qpcr_uid, target)
    
  # ASSAYS
  
  dilution_assay <- 
    qpcr |>
    select(qpcr_uid, target, dilution_factor) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = dilution_factor) |> 
    as.data.frame() |> 
    column_to_rownames("target") 
  
  cq_assay <- 
    qpcr |>
    select(qpcr_uid, target, cq) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = cq) |> 
    as.data.frame() |> 
    column_to_rownames("target") 
  
  starting_quantity_assay <- 
    qpcr |>
    select(qpcr_uid, target, starting_quantity) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = starting_quantity) |> 
    as.data.frame() |> 
    column_to_rownames("target")
  
   cq_mean_assay <- 
    qpcr |>
    select(qpcr_uid, target, cq_mean) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = cq_mean) |> 
    as.data.frame() |> 
    column_to_rownames("target")
   
   quant_adjusted_assay <- 
     qpcr |>
     select(qpcr_uid, target, quant_adjusted) |> 
     arrange() |> 
     pivot_wider(names_from = qpcr_uid, values_from = quant_adjusted) |> 
     as.data.frame() |> 
     column_to_rownames("target")
   
   copies_per_swab_raw_assay <- 
     qpcr |>
     select(qpcr_uid, target, copies_per_swab) |> 
     arrange() |> 
     pivot_wider(names_from = qpcr_uid, values_from = copies_per_swab) |> 
     as.data.frame() |> 
     column_to_rownames("target")
   
  copies_per_swab_assay <- 
    qpcr |>
    select(qpcr_uid, target, copies_per_swab_adj) |> 
    arrange() |> 
    pivot_wider(names_from = qpcr_uid, values_from = copies_per_swab_adj) |> 
    as.data.frame() |> 
    column_to_rownames("target")
  
    # COLDATA
   coldata <- 
     qpcr |> 
     select(
       qpcr_uid, 
       uid, pid, visit_code, location,
       qpcr_sample_type,  sample_type, control_type, 
       qpcr_sample_id, vibr_sample_id, replicate_nb, 
       ext_lib_plate_nb, ext_lib_plate_id
       ) |> 
     distinct() |> 
     arrange(qpcr_uid) |> 
     as.data.frame() |> 
     mutate(rownames = qpcr_uid) |> 
     column_to_rownames("rownames") 
  
   
    # ROWDATA
   rowdata <-
     qpcr |> 
     select(
       target, fluor, strain_group_qpcr, 
       pcr_plate_id, ext_lib_plate_nb, 
       taxon_label, LBP, strain_id, strain_origin, biose_id
       ) |> 
     distinct() |> 
     group_by(
       target, fluor, strain_group_qpcr, 
       taxon_label, LBP, strain_id, strain_origin, biose_id
       ) |> 
     arrange(target, ext_lib_plate_nb) |> 
     mutate(
       pcr_plate_id = str_c(pcr_plate_id, " (extr. plate ", ext_lib_plate_nb |> str_pad(width = 2, pad = "0"), ")", sep = ""),
     ) |> 
     summarize(
       pcr_plate_ids = pcr_plate_id |> unique() |> str_c(collapse = ", "),
       .groups = "drop"
     ) |> 
     ungroup() |> 
     as.data.frame() |> 
     mutate(rownames = target) |> 
     column_to_rownames("rownames")

  
   
   SE <- SummarizedExperiment(
     assays = list(
       dilution = dilution_assay |> as.matrix(),
       cq = cq_assay |> as.matrix(),
       cq_mean = cq_mean_assay |> as.matrix(),
       starting_quantity = starting_quantity_assay |> as.matrix(),
       quant_adjusted = quant_adjusted_assay |> as.matrix(),
       copies_per_swab_raw = copies_per_swab_raw_assay |>  as.matrix(),
       copies_per_swab = copies_per_swab_assay |> as.matrix()
     ),
     colData = coldata,
     rowData = rowdata,
     metadata = list(
       description = "Raw qPCR data from the VIBRANT study",
       date = today(),
       assay_and_coldata_dictionary = dictionary
     )
   )
   
   SE
  
}
Code
SE_qPCR_raw <- make_raw_qpcr_SE(qpcr)
SE_qPCR_raw
# A SummarizedExperiment-tibble abstraction: 58,032 × 31
# Features=16 | Samples=3627 | Assays=dilution, cq, cq_mean, starting_quantity,
#   quant_adjusted, copies_per_swab_raw, copies_per_swab
   .feature .sample      dilution    cq cq_mean starting_quantity quant_adjusted
   <chr>    <chr>           <dbl> <dbl>   <dbl>             <dbl>          <dbl>
 1 C0175A1  std_01_plat…       10  15.3    15.2          10000000      100000000
 2 C0022A1  std_01_plat…       10  16.0    16.0          10000000      100000000
 3 C0059E1  std_01_plat…       10  15.4    15.4          10000000      100000000
 4 UC101    std_01_plat…       10  13.9    13.9          10000000      100000000
 5 FF00018  std_01_plat…       10  14.6    14.6          10000000      100000000
 6 FF00051  std_01_plat…       10  14.5    14.6          10000000      100000000
 7 185329   std_01_plat…       30  14.0    13.9          10000000      300000000
 8 C0028A1  std_01_plat…       30  14.1    14.0          10000000      300000000
 9 C0112A1  std_01_plat…       30  14.6    14.6          10000000      300000000
10 FF00004  std_01_plat…       10  14.6    14.6          10000000      100000000
# ℹ 310 more rows
# ℹ 24 more variables: copies_per_swab_raw <dbl>, copies_per_swab <dbl>,
#   qpcr_uid <fct>, uid <chr>, pid <chr>, visit_code <chr>, location <chr>,
#   qpcr_sample_type <fct>, sample_type <chr>, control_type <chr>,
#   qpcr_sample_id <chr>, vibr_sample_id <chr>, replicate_nb <int>,
#   ext_lib_plate_nb <dbl>, ext_lib_plate_id <dbl>, target <fct>, fluor <chr>,
#   strain_group_qpcr <chr>, taxon_label <chr>, LBP <fct>, strain_id <fct>, …

Aggregated SummarizedExperiment object

We create a SummarizedExperiment object with summarized values (aggregated across replicates) where each “sample” is a VIBRANT sample ID (including control samples) and each feature is a target (LBP taxa), with the following assays

  • dilution
  • copies_per_swab_med (the median copies_per_swab across replicates)
  • copies_per_swab_mean (the mean copies_per_swab across replicates)
  • copies_per_swab_cv (the coefficient of variation of copies_per_swab across replicates)

The colData contains the same columns as above (except for those that are replicate-specific)

The rowData contains the same columns as above,

Code
agg_qpcr_SE <- function(SE_qPCR_raw){
  
  filtered_qpcr <- 
    SE_qPCR_raw |> 
    filter(
      !(qpcr_sample_type %in% c("Standard", "Water", "Empty")),
      !is.na(uid) 
      ) 
    
  summarized_qpcr <- 
    filtered_qpcr |>
    as_tibble() |> 
    group_by(uid, target) |> 
    summarize(
      dilution = mean(dilution, na.rm = TRUE),
      copies_per_swab_med = median(copies_per_swab),
      copies_per_swab_mean = mean(copies_per_swab),
      copies_per_swab_sd = sd(copies_per_swab),
      copies_per_swab_range = max(copies_per_swab) - min(copies_per_swab),
      .groups = "drop"
    ) |> 
    mutate(
      copies_per_swab_cv = ifelse(copies_per_swab_sd == 0, 0, copies_per_swab_sd / copies_per_swab_mean)
    ) 
  
    # COLDATA
   coldata <- 
     filtered_qpcr |> 
     colData() |> 
     as.data.frame() |> 
     as_tibble() |> 
     select(-qpcr_uid, -replicate_nb) |> 
     distinct() |>  
     arrange(uid) |> 
     as.data.frame() |> 
     mutate(rownames = uid) |> 
     column_to_rownames("rownames")
   
  # ROWDATA
   rowdata <-
     filtered_qpcr |> 
     rowData() |> 
     as.data.frame()
    
   SE <- SummarizedExperiment(
     assays = list(
        dilution = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = uid, values_from = dilution) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix(),
       copies_per_swab_med = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = uid, values_from = copies_per_swab_med) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix(),
       copies_per_swab_mean = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = uid, values_from = copies_per_swab_mean) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix(),
       copies_per_swab_cv = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = uid, values_from = copies_per_swab_cv) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix()
     ),
     colData = coldata,
     rowData = rowdata,
     metadata = list(
       description = "Aggregated qPCR data from the VIBRANT study",
       date = today(),
       assay_and_coldata_dictionary = SE_qPCR_raw@metadata$assay_and_coldata_dictionary
     )
   )
   
   SE
     
}
Code
SE_qPCR_agg <- agg_qpcr_SE(SE_qPCR_raw = SE_qPCR_raw)
SE_qPCR_agg
# A SummarizedExperiment-tibble abstraction: 16,224 × 26
# Features=16 | Samples=1014 | Assays=dilution, copies_per_swab_med,
#   copies_per_swab_mean, copies_per_swab_cv
   .feature .sample        dilution copies_per_swab_med copies_per_swab_mean
   <chr>    <chr>             <dbl>               <dbl>                <dbl>
 1 C0175A1  068100004_0000        5                   0                    0
 2 C0022A1  068100004_0000        5                   0                    0
 3 C0059E1  068100004_0000        5                   0                    0
 4 UC101    068100004_0000        5                   0                    0
 5 FF00018  068100004_0000        5                   0                    0
 6 FF00051  068100004_0000        5                   0                    0
 7 185329   068100004_0000       20                   0                    0
 8 C0028A1  068100004_0000       20                   0                    0
 9 C0112A1  068100004_0000       20                   0                    0
10 FF00004  068100004_0000       20                   0                    0
# ℹ 310 more rows
# ℹ 21 more variables: copies_per_swab_cv <dbl>, uid <chr>, pid <chr>,
#   visit_code <chr>, location <chr>, qpcr_sample_type <fct>,
#   sample_type <chr>, control_type <chr>, qpcr_sample_id <chr>,
#   vibr_sample_id <chr>, ext_lib_plate_nb <dbl>, ext_lib_plate_id <dbl>,
#   target <fct>, fluor <chr>, strain_group_qpcr <chr>, taxon_label <chr>,
#   LBP <fct>, strain_id <fct>, strain_origin <fct>, biose_id <chr>, …

Save SummarizedExperiment objects

Save the SE objects to disk

Code
saveRDS(
  SE_qPCR_raw, 
  str_c(
    get_output_dir(data_source = data_source),  
    "03_se_pcr_raw_", today() |> str_remove_all("-"), ".rds"
    )
  )


saveRDS(
  SE_qPCR_agg, 
  str_c(
    get_output_dir(data_source = data_source),  
    "03_se_pcr_agg_", today() |> str_remove_all("-"), ".rds"
    )
  )